
####################################################################
####################################################################
####################################################################
####################################################################
####################################################################
## Descriptive
####################################################################
####################################################################
####################################################################
####################################################################

rm(list=ls(all=TRUE))
options(warn=-1)

setwd("Your_File_Path")

#Load Packages
library(tidyverse)
library(readxl)
require(VIM) #missing data
require(caret) #ml analysis
require(xtable) #get xtabs for latex
require(stargazer) #summary statistics
require(corrplot) #create corrplot
require(gridGraphics) #aesthetics for graphics
require(dplyr) #manipulate data
library(scales) #get pcts
require(gridExtra) #plot multiple ggplot objects

agd_plus <- read_xlsx("rubin_malone_isq_replication_data.xlsx", sheet = "agd_plus") %>%
  dplyr::select(-c(1)) %>%
  mutate(main_sample = 1)

#this is group-level dataframe
df <- read_xlsx("rubin_malone_isq_replication_data.xlsx", sheet = "agd_raw") %>%
  dplyr::select(-c(1)) %>%
  left_join(dplyr::select(agd_plus, torgid, ccode, main_sample), by = c("torgid" = "torgid", "ccode" = "ccode")) %>%
  mutate(ideology_sub = ifelse(left == 1 | center == 1 | right == 1 | religious == 1 | islam == 1 | ethnonational == 1, 1, 0)
         , main_sample = ifelse(is.na(main_sample), 0, main_sample)
  )


######################
# Data Pre-Processing
######################

#Rename Variable Names for Correlation Plot

df$NSA_Support = df$nsasup
df$LowIntensity_CivilWar = df$civwar
df$HighIntensity_CivilWar = df$civwar1000
df$StateSupport = df$statesup
df$VeteranBase = ifelse(df$primarymem=="Ex-Militants/Military", 1, 0)
df$ForeignFighters = ifelse(df$primarymem=="Foreign Fighters", 1, 0)
df$RefugeeBase = ifelse(df$primarymem=="Refugee/Exiles", 1, 0)
df$StudentBase = ifelse(df$primarymem=="Student", 1, 0)
df$ReligiousBase = ifelse(df$primarymem=="ReligComm", 1, 0)
df$Splinter = ifelse(df$typeform=="Splinter", 1, 0)
df$Merger = ifelse(df$typeform=="Merger", 1, 0)
df$PoliticalParty = ifelse(df$typeform=="Political Party Militarizes", 1, 0)
df$PoliticalWing = df$polwing
df$VeteranBase[df$ForeignFightersBase==1] = 1 #record veterans to include foreign fighters (in addition to ex-military/police and ex-rebels)
df$aim2 <- as.numeric(df$aim2)
df$Separatist = df$aim2-1
df$Leftist = df$left
df$Ethnonational = df$ethnonational
df$Transnational = df$transnationalattack

#get earliest known year of group activity based on yr of formation or yr of first attack
df$startyear = ifelse(is.na(df$yrfirstattack)==TRUE, df$yrform, df$yrfirstattack)
#roughly code whether the group began during the Cold War
df$ColdWar = ifelse(df$startyear < 1991, 1, 0)

#create categorical region variable to get summ stats by region
df$region[df$Americas==1] = "Americas"
df$region[df$MENA==1] = "Middle East/North Africa"
df$region[df$SSA==1] = "Sub-Sahara Africa"
df$region[df$Asia==1] = "Asia"
df$region[df$Europe==1] = "Europe"
df$Region = df$region


#Using VIM package, we can look for patterns of missingness:
summary(aggr(df))
aggr(df, numbers=T) 

################
# Entries
################
length(unique(df$torgid))
length(unique(df$ccode))

################
# CROSS-TABS
################

#get different cross-tabs looking at patterns of sponsorship by
# - civil war status
# - Cold War timeframe
# - whether group also receives non-state support (nsasup)
# - where the group operates (region)

xtable(table(Support=df$statesup, LIC=df$LowIntensity_CivilWar))
xtable(table(Support=df$statesup, HIC=df$HighIntensity_CivilWar))
xtable(table(Support=df$statesup, ColdWar=df$ColdWar))
xtable(table(Support=df$statesup, NonState=df$NSA_Support))
xtable(table(Support=df$statesup, Region=df$Region))

################
# SUMMARY STATS
################

df_summ = with(df, data.frame(LowIntensity_CivilWar, HighIntensity_CivilWar, #outcome variables
                              StateSupport, #whether group receives any support
                              statetypesupmaterial, #whether group receives material support
                              statetypesuptrain, #whether group receives training
                              statetypesupfinancial, #whether group receives financial assistance
                              statetypesupterr, #whether group has territorial sanctuary
                              #statetypesupother, #whether group receives logistical or undefined support
                              statetypesupendorse)) #whether group receives immaterial support/political endorsement
stargazer(df_summ, summary.stat = c("N", "min", "mean", "median", "max"))

################
## CORR MATRIX
#### Appendix Figure 1
################

df_group = with(df, data.frame(LowIntensity_CivilWar, HighIntensity_CivilWar, #civil war status
                               StateSupport, NSA_Support, #types of external support
                               Separatist, #group issue incompatibility (Territorial)
                               Leftist, Ethnonational , #group ideology
                               Splinter, Merger, PoliticalParty, #how group forms
                               EPRIrrelevant, EPRPowerless, EPRDiscriminated,  #EPR status of group in year it first conducts violence
                               Transnational,  #binary whether group conducts transnational attacks   
                               RefugeeBase, StudentBase, VeteranBase, ReligiousBase , #social base of primary/founding members (similar to FORGE) 
                               PoliticalWing)) #whether group includes polwing (similar to CGS NSA data)

M = cor(df_group, use="complete")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot_all = corrplot(M, method="color", col=col(200),  
                        type="upper", order="original", 
                        addCoef.col = "black", # Add coefficient of correlation
                        tl.col="black", tl.srt=45, tl.cex = 0.5, #Text label color and rotation
                        # hide correlation coefficient on the principal diagonal
                        diag=FALSE,   # Change font size of correlation coefficients
                        number.cex = 0.5)


corrplot(M, method="color", col=col(200),  
         type="upper", order="original", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, tl.cex = 0.5, #Text label color and rotation
         # hide correlation coefficient on the principal diagonal
         diag=FALSE,   # Change font size of correlation coefficients
         number.cex = 0.5)


#############################
## Main Sample
##############################


df4 = filter(df, main_sample > 0) %>%
  count(LowIntensity_CivilWar, StateSupport)  %>% 
  mutate(prop = round(n / sum(n),3)
         , StateSponsored = factor(StateSupport, levels = 0:1, labels = c("Yes", "No"))
         , percent = percent(prop)
         , freq_pct = paste0(as.character(n), " (", as.character(percent), ")")
  )

#create labels
df4$Category = NA
df4$Category[df4$LowIntensity_CivilWar==1 & df4$StateSupport==0] = "No Sponsorship, Civil War (25bd+)"
df4$Category[df4$LowIntensity_CivilWar==1 & df4$StateSupport==1] = "Sponsorship, Civil War  (25bd+)"
df4$Category[df4$LowIntensity_CivilWar==0 & df4$StateSupport==0] = "No Sponsorship, No Civil War"
df4$Category[df4$LowIntensity_CivilWar==0 & df4$StateSupport==1] = "Sponsorship, No Civil War"


#########################################
## Figure 1 in the Main Manuscript
#########################################

#ggplot obj
bar_chart_freq = ggplot(df4, aes(x = as.factor(StateSupport), y = n, fill = Category)) +
  geom_col(width = 1, stat = "identity", color='white')   +
  theme_bw() + guides(fill=guide_legend(ncol=2)) + 
  theme(legend.position="bottom", legend.box="vertical", legend.margin=margin(2)) + 
  geom_text(aes(label = freq_pct), position = position_stack(vjust = .5)) +
  labs(title = "Number of Armed Groups by State Sponsorship and Civil War (25bd+)") + xlab("Sponsorship") +
  ylim(0, 1200) +
  ylab("Number of Armed Groups") + scale_x_discrete(labels=c("No", "Yes")) + theme(plot.title = element_text(hjust = 0.5))
bar_chart_freq

#export ggplot to pdf
pdf("barchart_freq_main.pdf", 7,6)
bar_chart_freq
dev.off()

#ggplot obj
bar_chart_prop = ggplot(df4, aes(x = as.factor(StateSupport), y = prop, fill = Category)) +
  geom_col(width = 1, stat = "identity", color='white')   +
  theme_bw() + guides(fill=guide_legend(ncol=2)) + 
  theme(legend.position="bottom", legend.box="vertical", legend.margin=margin(2)) + 
  geom_text(aes(label = percent), position = position_stack(vjust = .5)) +
  labs(title = "Proportion of Armed Groups by State Sponsorship and Civil War (25bd+)") + xlab("Sponsorship") +
  ylab("Percentage of Armed Groups") + scale_x_discrete(labels=c("No", "Yes")) + theme(plot.title = element_text(hjust = 0.5))
bar_chart_prop

#export ggplot to pdf
pdf("barchart_prop_main.pdf", 7,6)
bar_chart_prop
dev.off()

#############################
## Sub Sample (Appendix Section D)
##############################

df5 = filter(df, main_sample > 0, ideology_sub > 0) %>%
  count(LowIntensity_CivilWar, StateSupport)  %>% 
  mutate(prop = round(n / sum(n),3)
         , StateSponsored = factor(StateSupport, levels = 0:1, labels = c("Yes", "No"))
         , percent = percent(prop)
         , freq_pct = paste0(as.character(n), " (", as.character(percent), ")")
  )

#create labels
df5$Category = NA
df5$Category[df5$LowIntensity_CivilWar==1 & df5$StateSupport==0] = "No Sponsorship, Civil War (25bd+)"
df5$Category[df5$LowIntensity_CivilWar==1 & df5$StateSupport==1] = "Sponsorship, Civil War  (25bd+)"
df5$Category[df5$LowIntensity_CivilWar==0 & df5$StateSupport==0] = "No Sponsorship, No Civil War"
df5$Category[df5$LowIntensity_CivilWar==0 & df5$StateSupport==1] = "Sponsorship, No Civil War"

#########################################
## Appendix Figure 3
#########################################

#ggplot obj
bar_chart_freq_sub = ggplot(df5, aes(x = as.factor(StateSupport), y = n, fill = Category)) +
  geom_col(width = 1, stat = "identity", color='white')   +
  theme_bw() + guides(fill=guide_legend(ncol=2)) + 
  theme(legend.position="bottom", legend.box="vertical", legend.margin=margin(2)) + 
  geom_text(aes(label = freq_pct), position = position_stack(vjust = .5)) +
  labs(title = "Number of Armed Groups by State Sponsorship and Civil War (25bd+)") + xlab("Sponsorship") +
  ylim(0, 1200) +
  ylab("Number of Armed Groups") + scale_x_discrete(labels=c("No", "Yes")) + theme(plot.title = element_text(hjust = 0.5))
bar_chart_freq_sub

#export ggplot to pdf
pdf("barchart_freq_sub.pdf", 7,6)
bar_chart_freq_sub
dev.off()


#ggplot obj
bar_chart_prop_sub = ggplot(df5, aes(x = as.factor(StateSupport), y = prop, fill = Category)) +
  geom_col(width = 1, stat = "identity", color='white')   +
  theme_bw() + guides(fill=guide_legend(ncol=2)) + 
  theme(legend.position="bottom", legend.box="vertical", legend.margin=margin(2)) + 
  geom_text(aes(label = percent), position = position_stack(vjust = .5)) +
  labs(title = "Proportion of Armed Groups by State Sponsorship and Civil War (Subsample)") + xlab("Sponsorship") +
  ylab("Percentage of Armed Groups") + scale_x_discrete(labels=c("No", "Yes")) + theme(plot.title = element_text(hjust = 0.5))
bar_chart_prop_sub

#export ggplot to pdf
pdf("barchart_prop_sub.pdf", 7,6)
bar_chart_prop_sub
dev.off()


####################################################################
####################################################################
####################################################################
####################################################################
####################################################################
## Main Analysis
####################################################################
####################################################################
####################################################################
####################################################################


rm(list=ls(all=TRUE))
options(warn=-1)
setwd("Your_File_Path")


#Load Packages

library(tidyverse)
library(readxl)

#data viz and exploratory data analysis
require(ggplot2)
library(ggcorrplot) #corr plots
require(sjPlot) #coef plot presentation
require(VIM) #missing data

#ml assessment
require(caret) #ml
library(ROSE) #correct for class imbalance
require(pROC) #ROC curve - model assessment
require(vip) #variable importance plot

#regression analysis
require(multiwayvcov)
require(lmtest)

#export results
require(texreg)
require(xtable)

###########################################################################
###########################################################################

#Load Main Data
df_full <- read_xlsx("rubin_malone_isq_replication_data.xlsx", sheet = "agd_plus") %>%
  dplyr::select(-c(1))

length(unique(df_full$torgid))
length(unique(df_full$ccode))

length(unique(df_full$torgid[df_full$LowIntensity_CivilWar==1]))
length(unique(df_full$torgid[df_full$HighIntensity_CivilWar==1]))

# basic stats for core subsample of groups
length(unique(filter(df_full, ideology_sub == 1)$torgid))
length(unique(filter(df_full, ideology_sub == 1)$ccode))

length(unique(filter(df_full, ideology_sub == 1)$torgid[filter(df_full, ideology_sub == 1)$LowIntensity_CivilWar==1]))
length(unique(filter(df_full, ideology_sub == 1)$torgid[filter(df_full, ideology_sub == 1)$HighIntensity_CivilWar==1]))


############################
## EDA
############################

###########################
## Appendix Figure 1
###########################

df2=with(df_full, data.frame(StateSupport , RefugeeBase  , 
                             PoliticalParty , Splinter , Merger  , 
                             Separatist , 
                             Leftist , Ethnonational , Islam , 
                             EPRIrrelevant , EPRPowerless , EPRDiscriminated  ,  Transnational ,   
                             StudentBase , VeteranBase , ReligiousBase , NSA_Support , PoliticalWing ,
                             Shared_Left_Ties , Shared_Ethnic_Ties , Shared_Sunni_Ties , Shared_Shia_Ties ,
                             Neigh_Rival , Avg_GDP_Neigh , Avg_Dem_Neigh , MID_Activity))
corr <- round(cor(df2, use="complete"), 2)


png("statesupport_corr.png", width=15,height=15, units = "in", res=500)
ggcorrplot(corr, hc.order = TRUE, type = "lower",
           lab = TRUE,
           outline.col = "white",
           ggtheme = ggplot2::theme_bw,
           colors = c("#6D9EC1", "white", "#E46726"))
dev.off()



##############################
# Data Pre-Processing
##############################

#for binary outcome variable, we need to transform our outcome of interest into factor
df_full$StateSupport = as.factor(df_full$StateSupport)
levels(df_full$StateSupport) = c("none", "support")

#Partition Data into training and test set
#Set seed to make partition reproducible
set.seed(1234)

#Randomly sample 85% of observations for training data 
#More training data is better, but a good baseline is 1-20% in the test set
smp_size <- floor(0.85* nrow(df_full))
train_ind = sample(seq_len(nrow(df_full)), size = smp_size)
train = df_full[train_ind, ]
test = df_full[-train_ind, ]


#Using VIM package, we can look for patterns of missingness:
summary(aggr(train))
aggr(train, numbers=T) 

############################
## check proportions for core subsample

nrow(filter(train, ideology_sub == 1))/nrow(filter(df_full, ideology_sub == 1))
nrow(filter(test, ideology_sub == 1))/nrow(filter(df_full, ideology_sub == 1))



##############################################################################
#for k-fold CV, we randomly partition the training data into different folds
#we estimate the model on k-1 folds and then use remaining kth fold for validation
#to randomly partition for CV in caret, we need to create a reproducible list of seeds
#this is hard to accomplish because caret inherently uses some stochastic RNG
#https://stackoverflow.com/questions/32099991/seed-object-for-reproducible-results-in-parallel-operation-in-caret
#however we can fix the cv seeds (cvseeds) using the following procedure

#k is the number of folds for cv, e.g. k=10 for 10-fold cv
#repeated cv is n_resampling (method="repeatedcv")
#length = (k-folds*nresampling) + 1
set.seed(2022)
cvseeds = vector(mode = "list", length = 11) #length 11 because it is k+1
for(i in 1:10) cvseeds[[i]] <- sample.int(1000, 11) #number of tuning parameter combinations
## For the last model:
cvseeds[[11]] <- sample.int(1000, 1) 

#to use caret we need to create control function - specifying use of CV and class imbalance correction
train_control = caret::trainControl(method="cv", #training method
                                    number=10, #number of folds
                                    #repeats = 5,
                                    savePredictions=T, 
                                    classProbs=T,  # should class probabilities be returned
                                    sampling="smote",  #smote sample
                                    summaryFunction=twoClassSummary, 
                                    seeds=cvseeds)



###########################################################################
###########################################################################
###########################################################################
###########################################################################


###########################################################################
###########################################################################
## Analyses using full sample of groups
###########################################################################
###########################################################################


##############
# Task 1: Correlates of State Support
# Intuition: Use basic classification mechanism to identify most important variables
# We are going to use bread and butter method="glm", but you could also use other classification algorithms like CART
##############

#Part A: Classification (Predictive) Analysis

#Build out training model using caret
logit.model = train(StateSupport ~  
                      Separatist + 
                      Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
                      EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
                      PoliticalParty + Splinter + Merger  + 
                      Transnational +   
                      RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
                      NSA_Support + PoliticalWing +
                      
                      Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
                    data=train, #training data
                    method = "glm", #algorithm
                    family="binomial", 
                    trControl=train_control #control function (10-fold CV)
)

summary(logit.model)
nrow(logit.model$trainingData)

write_rds(logit.model, "results/sponsorship_classification_logit.rds")

logitmodel_summary <- summary(logit.model)
logitmodel_results <- as.data.frame(logitmodel_summary$coefficients)
names(logitmodel_results) <- c("coef", "se", "z", "p")

logitmodel_results_tab <- tibble(varnames = c("Intercept", logit.model$coefnames)
                                 , coef = logitmodel_results$coef
                                 , se = logitmodel_results$se
                                 , z = logitmodel_results$z
                                 , p = logitmodel_results$p
)

write_csv(logitmodel_results_tab, "results/sponsorship_classification_logit_results.csv")

logitmodel_modelstats <-  tibble(model = "logitmodel") %>%
  mutate(aic = logitmodel_summary$aic
         , deviance = logitmodel_summary$deviance
         , num_train_obs = nrow(logit.model$trainingData)
  )

write_csv(logitmodel_modelstats, "results/sponsorship_classification_logit_modelstats.csv")


#create predictions for test data to assess predictive accuracy of model
#predicted results will return 2 cols (Pr(None) and Pr(Support)) because training function specified classprobs=T
logit.pred = predict(logit.model, newdata=test, type="prob")

write_rds(logit.pred, "results/sponsorship_classification_logit_pred.rds")


#look at confusion matrix to get additional performance statistics
#use threshold 0.5 to classify Pr(State Support)
#compare predicted probability classification relative to true test$StateSupport
logit_results = confusionMatrix(as.factor(ifelse(logit.pred[,2] > 0.5, "support", "none")), 
                                test$StateSupport, positive="support")
logit_results
write_rds(logit_results, "results/sponsorship_classification_logit_confmatrix.rds")

logitmodel_confusionmatrix <- tibble(model = c("SponsorshipClass")) %>%
  mutate(bal_accuracy = c(logit_results$byClass[11])
         , sensitivity = c(logit_results$byClass[1])
         , specificity = c(logit_results$byClass[2])
         , kappa = c(logit_results$overall[2])
         , auc = c(auc(test$StateSupport, logit.pred[,2]))
  )

write_csv(logitmodel_confusionmatrix, "results/sponsorship_classification_logit_confmatrix.csv")



#check AUC for accuracy
roc.curve(test$StateSupport, logit.pred[,2])
auc(test$StateSupport, logit.pred[,2])

##############################################
## Appendix Figure 2
#### visualize ROC curve
##############################################
pdf("figures/roc_sponsorship.pdf", 6, 6)
roc.curve(test$StateSupport, logit.pred[,2])
dev.off()



#############################################
# Variable Importance
#### Figure 2B in Main Manuscript
############################################
# unlike statistical significance, this helps us assess predictive significance
vip(logit.model, num_features = 24) + theme_bw()
#extract raw variable importance (vi) scores from the model
vi_scores = vi(logit.model)
vi_scores

write_csv(vi_scores, "results/sponsorship_classification_logit_vi_scores.csv")

#create new column to vi scores assigning them to different categories
vi_scores$category = ifelse(vi_scores$Variable == "Separatist" |
                              vi_scores$Variable == "Leftist" |
                              vi_scores$Variable == "Ethnonational" |
                              vi_scores$Variable == "Islam" , "Aims & Ideology", NA)

vi_scores$category[is.na(vi_scores$category)==TRUE & (vi_scores$Variable == "Neigh_Rival" | 
                                                        vi_scores$Variable == "Avg_GDP_Neigh" |
                                                        vi_scores$Variable == "Avg_Dem_Neigh" |
                                                        vi_scores$Variable == "MID_Activity")] = "Neighborhood"
vi_scores$category[is.na(vi_scores$category)==TRUE & (vi_scores$Variable == "Shared_Left_Ties" |
                                                        vi_scores$Variable == "Shared_Ethnic_Ties" |
                                                        vi_scores$Variable == "Shared_Sunni_Ties" |
                                                        vi_scores$Variable == "Shared_Shia_Ties" )] = "Relational"
vi_scores$category[is.na(vi_scores$category)==TRUE] = "Organizational"

#vip is compatible with ggplot so can visualize model results
varimp_fullglm  = vip(vi_scores, geom="point", num_features = 29) + geom_point(aes(colour=factor(category)), size=3) + 
  scale_colour_discrete(name = "Group") + theme_bw() + 
  ggtitle("Variable Importance for Correlates of State Sponsorship")+
  theme(plot.title = element_text(hjust = 0.5)) +theme(axis.text=element_text(size=16),
                                                       axis.title=element_text(size=16,face="bold")) + theme(legend.position = c(.65, .3))
varimp_fullglm

png("figures/sponsorship_glmvarimp.png",width=7,height=8, units = "in", res=500)
varimp_fullglm
dev.off()

png("figures/sponsorship_varimp.png",width=7,height=8, units = "in", res=500)
varimp_fullglm
dev.off()

ggsave("figures/Figure_2B.jpg", width = 6.5, height = 9, units = "in")
varimp_fullglm
dev.off()



#######################################
## Part B: Regression Analysis
#### Appendix Table 2
#### Figure 2 in Main Manuscript
#######################################

#pooled model 
m0 = glm(StateSupport ~ Separatist + 
           Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
           EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
           PoliticalParty + Splinter + Merger  + 
           Transnational +   
           RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
           NSA_Support + PoliticalWing +
           
           Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
         data=train,
         family="binomial")

nrow(model.matrix(m0))
write_rds(m0, "results/m0.rds")
write.csv(coeftest(m0, vcov(m0)), "results/m0.csv")


m0_vcov_firm <- cluster.vcov(m0, train$ccode1)
m0_cl = coeftest(m0, m0_vcov_firm)
m0_cl
write_rds(m0_cl, "results/m0_cl.rds")
write.csv(m0_cl, "results/m0_cl.csv")


m0_modelstats <- tibble(model = "m0") %>%
  mutate(aic = AIC(m0)
         , bic = BIC(m0)
         , llik = logLik(m0)[1]
         , deviance = m0$deviance
         , num_obs = nrow(model.matrix(m0))
  )

write_csv(m0_modelstats, "results/m0_modelstats.csv")


#country-adj model
m1 = glm(StateSupport ~ Separatist + 
           Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
           EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
           PoliticalParty + Splinter + Merger  + 
           Transnational +   
           RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
           NSA_Support + PoliticalWing +
           
           Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity + factor(ccode1),
         data=train,
         family="binomial")

nrow(model.matrix(m1))
write_rds(m1, "results/m1.rds")
write.csv(coeftest(m1, vcov(m1)), "results/m1.csv")

m1_vcov_firm <- cluster.vcov(m1, train$ccode1)
m1_cl = coeftest(m1, m1_vcov_firm)
m1_cl

write_rds(m1_cl, "results/m1_cl.rds")
write.csv(m1_cl, "results/m1_cl.csv")

m1_modelstats <- tibble(model = "m1") %>%
  mutate(aic = AIC(m1)
         , bic = BIC(m1)
         , llik = logLik(m1)[1]
         , deviance = m1$deviance
         , num_obs = nrow(model.matrix(m1))
  )

write_csv(m1_modelstats, "results/m1_modelstats.csv")




#export results
texreg(list(m0_cl, m1_cl), omit.coef="ccode1")
texreg(list(m0, m1), omit.coef="ccode1")
wordreg(list(m0_cl,m1_cl), omit.coef=c("ccode1"), file = "mainresults_correlates.doc")
wordreg(list(m0, m1), omit.coef=c("ccode1"), file = "mainresults_correlates_rebel_nonclustered.doc")

########################################
## Figure 2A in Main Manuscript
########################################
#use sjplot to look at coefplot
spon_coef = plot_model(m0,
                       sort.coef = "sort.all", ci_method="wald",
                       transform = "exp", vcov.fun = "CL",
                       p.threshold = c(0.05, 0.01, 0.001),
                       vcov.type = "HC1", #use robust cluster standard errors
                       vcov.args = list(cluster = train$ccode1),   
                       show.values=TRUE, show.p=TRUE, vline.color = "gray90", dot.size=4, value.offset = .4,
                       value.size=10)  + theme_bw() + ggtitle(" ") +
  theme(plot.title = element_text(hjust = 0.5)) + theme(legend.text=element_text(size=28), legend.title=element_text(size=28))  + theme(axis.text=element_text(size=36),    axis.title=element_text(size=36)) + theme(plot.title = element_text(size=40))+ 
  theme(plot.title = element_text(hjust = 0.5)) 

spon_coef + scale_y_continuous(limits = c(0, 100))

png("figures/sponsorship_coefplot_1.png",width=15,height=21, units = "in", res=500)
spon_coef + scale_y_continuous(limits = c(-1, 60))
dev.off()

png("figures/sponsorship_coefplot.png",width=15,height=21, units = "in", res=500)
spon_coef
dev.off()

ggsave("figures/Figure_2A.jpg", width = 6.5, height = 7, units = "in")
spon_coef + scale_y_continuous(limits = c(-10, 100))
dev.off()


#################################################
## Task 2: Correlates of Civil War
#### Appendix Table 3
##################################################
# Intuition: Same as above, use basic classification mechanism but change the outcome of interest to civil war

#pooled regression
m_war = glm(LowIntensity_CivilWar ~ StateSupport + Separatist + 
              Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
              EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
              PoliticalParty + Splinter + Merger  + 
              Transnational +   
              RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
              NSA_Support + PoliticalWing +
              
              Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
            data=train,family="binomial")
summary(m_war)
nrow(model.matrix(m_war))
write_rds(m_war, "results/m_war.rds")
write.csv(coeftest(m_war, vcov(m_war)), "results/m_war.csv")

vcov_firm <- cluster.vcov(m_war, train$ccode1)
m_war_cl = coeftest(m_war, vcov_firm)
m_war_cl

write_rds(m_war_cl, "results/m_war_cl.rds")
write.csv(m_war_cl, "results/m_war_cl.csv")


m_war_modelstats <- tibble(model = "m_war") %>%
  mutate(aic = AIC(m_war)
         , bic = BIC(m_war)
         , llik = logLik(m_war)[1]
         , deviance = m_war$deviance
         , num_obs = nrow(model.matrix(m_war))
  )

write_csv(m_war_modelstats, "results/m_war_modelstats.csv")


require(sjPlot)

a = plot_model(m_war,
               sort.coef = "sort.all", ci_method="wald",
               transform = "exp", vcov.fun = "CL",
               p.threshold = c(0.05, 0.01, 0.001),
               vcov.type = "HC1",
               vcov.args = list(cluster = train$ccode1),   
               show.values=TRUE, show.p=TRUE, vline.color = "gray90", dot.size=4, value.offset = .4,
               value.size=10)  + theme_bw() + ggtitle(" ") +
  theme(plot.title = element_text(hjust = 0.5)) + theme(legend.text=element_text(size=28), legend.title=element_text(size=28))  + theme(axis.text=element_text(size=36),    axis.title=element_text(size=36)) + theme(plot.title = element_text(size=40))+ 
  theme(plot.title = element_text(hjust = 0.5))


png("civilwar_coefplot.png",width=15,height=21, units = "in", res=500)
a
dev.off()


##############
# Task 3: Analysis of Variance
# Intuition: Compare model performance of different models to see if including info about state sponsorship improves model fit
##############

#sponsor model: estimates Pr(Civil War) using ONLY state sponsorship
m_war_ss = glm(LowIntensity_CivilWar ~ StateSupport,
               data=train,family="binomial")
write_rds(m_war_ss, "results/m_war_ss.rds")
write.csv(coeftest(m_war_ss, vcov(m_war_ss)), "results/m_war_ss.csv")

m_war_ss_modelstats <- tibble(model = "m_war_ss") %>%
  mutate(aic = AIC(m_war_ss)
         , bic = BIC(m_war_ss)
         , llik = logLik(m_war_ss)[1]
         , deviance = m_war_ss$deviance
         , num_obs = nrow(model.matrix(m_war_ss))
  )

write_csv(m_war_ss_modelstats, "results/m_war_ss_modelstats.csv")


#Look at predictive performance
logit.pred.ss = predict(m_war_ss, newdata=test, type="response")
write_rds(logit.pred.ss, "results/m_war_ss_logitpred.rds")

logit_results_ss = confusionMatrix(as.factor(ifelse(logit.pred.ss > 0.5, 1,0)), 
                                   as.factor(test$LowIntensity_CivilWar), positive="1")
logit_results_ss
write_rds(logit_results_ss, "results/m_war_ss_logitresults.rds")

#alt measure of predictive performance
roc.ss = roc(test$LowIntensity_CivilWar, logit.pred.ss)
auc.ss = auc(test$LowIntensity_CivilWar, logit.pred.ss)



#base model: estimates Pr(Civil War) includes main correlates but NOT state sponsorship
m_war_base = glm(LowIntensity_CivilWar ~ Separatist + 
                   Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
                   EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
                   PoliticalParty + Splinter + Merger  + 
                   Transnational +   
                   RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
                   NSA_Support + PoliticalWing +
                   
                   Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
                 data=train,family="binomial")

write_rds(m_war_base, "results/m_war_base.rds")
write.csv(coeftest(m_war_base, vcov(m_war_base)), "results/m_war_base.csv")

m_war_base_modelstats <- tibble(model = "m_war_base") %>%
  mutate(aic = AIC(m_war_base)
         , bic = BIC(m_war_base)
         , llik = logLik(m_war_base)[1]
         , deviance = m_war_base$deviance
         , num_obs = nrow(model.matrix(m_war_base))
  )

write_csv(m_war_base_modelstats, "results/m_war_base_modelstats.csv")

#Look at predictive performance
logit.pred.base = predict(m_war_base, newdata=test, type="response")
write_rds(logit.pred.base, "results/m_war_base_logitpred.rds")

logit_results_base = confusionMatrix(as.factor(ifelse(logit.pred.base > 0.5, 1,0)), 
                                     as.factor(test$LowIntensity_CivilWar), positive="1")
logit_results_base
write_rds(logit_results_base, "results/m_war_base_logitresults.rds")

#alt measure of predictive performance

roc.base = roc(test$LowIntensity_CivilWar, logit.pred.base)
auc.base = auc(test$LowIntensity_CivilWar, logit.pred.base)


#full model: estimates Pr(Civil War) includes main correlates AND state sponsorship
m_war_full = glm(LowIntensity_CivilWar ~ StateSupport + Separatist + 
                   Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
                   EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
                   PoliticalParty + Splinter + Merger  + 
                   Transnational +   
                   RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
                   NSA_Support + PoliticalWing +
                   
                   Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
                 data=train,family="binomial")

write_rds(m_war_full, "results/m_war_full.rds")
write.csv(coeftest(m_war_full, vcov(m_war_full)), "results/m_war_full.csv")

m_war_full_modelstats <- tibble(model = "m_war_full") %>%
  mutate(aic = AIC(m_war_full)
         , bic = BIC(m_war_full)
         , llik = logLik(m_war_full)[1]
         , deviance = m_war_full$deviance
         , num_obs = nrow(model.matrix(m_war_full))
  )

write_csv(m_war_full_modelstats, "results/m_war_full_modelstats.csv")

#Look at predictive performance
logit.pred.full = predict(m_war_full, newdata=test, type="response")
write_rds(logit.pred.full, "results/m_war_full_logitpred.rds")


logit_results_full = confusionMatrix(as.factor(ifelse(logit.pred.full > 0.5, 1,0)), 
                                     as.factor(test$LowIntensity_CivilWar), positive="1")
logit_results_full
write_rds(logit_results_full, "results/m_war_full_logitresults.rds")

#alt measure of predictive performance
roc.full = roc(test$LowIntensity_CivilWar, logit.pred.full)
auc.full = auc(test$LowIntensity_CivilWar, logit.pred.full)


#############################################
## Table 1 in Main Manuscript
#############################################

#Compare fit of different modeling specifications
anova(m_war_ss, m_war_base)
anova(m_war_ss, m_war_full)

anova(m_war_base, m_war_full)

m_war_anova <- anova(m_war_ss, m_war_base, m_war_full)
m_war_anova
write.csv(m_war_anova, "results/m_war_anova.csv")

m_war_anova <- read.csv("results/m_war_anova.csv")

#export results of all 3 models
xtable(anova(m_war_ss, m_war_base, m_war_full))


#export results -- regression models
texreg(list(m_war_ss, m_war_base, m_war_full), omit.coef="ccode1")


#compare confusion matrix of all 3 models
logit_results_ss

logit_results_base

logit_results_full

#kappa
logit_results_ss$overall[2]
logit_results_base$overall[2]
logit_results_full$overall[2]

#accuracy/sensitivity/specificity
logit_results_ss$byClass
logit_results_base$byClass
logit_results_full$byClass

#compare AUC of all 3 models
auc.ss
auc.base
auc.full


##########################################
## Appendix Table 4
##########################################

m_war_confusionmatrix <- tibble(model = c("Base", "Sponsorship", "Full")) %>%
  mutate(bal_accuracy = c(logit_results_base$byClass[11], logit_results_ss$byClass[11], logit_results_full$byClass[11])
         , sensitivity = c(logit_results_base$byClass[1], logit_results_ss$byClass[1], logit_results_full$byClass[1])
         , specificity = c(logit_results_base$byClass[2], logit_results_ss$byClass[2], logit_results_full$byClass[2])
         , kappa = c(logit_results_base$overall[2], logit_results_ss$overall[2], logit_results_full$overall[2])
         , auc = c(auc.base, auc.ss, auc.full)
  )

write_csv(m_war_confusionmatrix, "results/m_war_confusionmatrix.csv")



#######################################################
## Figure 3 in Main Manuscript
#### visualize AUC/ROC performance of all models
#######################################################

ggroc(roc.base)
g <- ggroc(list(roc.base, roc.ss, roc.full),aes=c("linetype"), legacy.axes=T)  + 
  theme_light() +  labs(y = "Sensitivity", x = "1-Specificity") + 
  scale_linetype_manual(values=c("solid", "twodash", "dotted"), 
                        labels=c("Base", "Sponsorship", "Base + Sponsorship"), name="Model:") +  
  geom_text(data=NULL, check_overlap=TRUE, size=4, hjust = 0, aes(x=0.5, y=0.08, 
                                                                  label= paste0("AUC:", "\n Base: ", as.character(round(auc.base, 3)), "\n Sponsorship: ", as.character(round(auc.ss, 3)), "\n Base + Sponsorship: ", as.character(round(auc.full, 3)))
  )) +    
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="darkgrey", linetype="dashed") 


pdf("figures/roc_comparison.pdf", 8,7)
g
dev.off()



###########################################################################
###########################################################################
###########################################################################
###########################################################################


###########################################################################
###########################################################################
## Analyses excluding exclusively environmental or anarchist groups
###########################################################################
###########################################################################


##############
# Task 1: Correlates of State Support
# Intuition: Use basic classification mechanism to identify most important variables
# We are going to use bread and butter method="glm", but you could also use other classification algorithms like CART
##############

#Part A: Classification (Predictive) Analysis

#Build out training model using caret
logit.model.sub = train(StateSupport ~  
                          Separatist + 
                          Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
                          EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
                          PoliticalParty + Splinter + Merger  + 
                          Transnational +   
                          RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
                          NSA_Support + PoliticalWing +
                          
                          Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
                        data=filter(train, ideology_sub == 1), #training data
                        method = "glm", #algorithm
                        family="binomial", 
                        trControl=train_control #control function (10-fold CV)
)

summary(logit.model.sub)
nrow(logit.model.sub$trainingData)

write_rds(logit.model.sub, "results/sub_sponsorship_classification_logit.rds")

logitmodel_summary_sub <- summary(logit.model.sub)
logitmodel_results_sub <- as.data.frame(logitmodel_summary_sub$coefficients)
names(logitmodel_results_sub) <- c("coef", "se", "z", "p")

logitmodel_results_tab_sub <- tibble(varnames = c("Intercept", logit.model.sub$coefnames)
                                     , coef = logitmodel_results_sub$coef
                                     , se = logitmodel_results_sub$se
                                     , z = logitmodel_results_sub$z
                                     , p = logitmodel_results_sub$p
)

write_csv(logitmodel_results_tab_sub, "results/sub_sponsorship_classification_logit_results.csv")

logitmodel_modelstats_sub <-  tibble(model = "logitmodel") %>%
  mutate(aic = logitmodel_summary_sub$aic
         , deviance = logitmodel_summary_sub$deviance
         , num_train_obs = nrow(logit.model.sub$trainingData)
  )

write_csv(logitmodel_modelstats_sub, "results/sub_sponsorship_classification_logit_modelstats.csv")


#create predictions for test data to assess predictive accuracy of model
#predicted results will return 2 cols (Pr(None) and Pr(Support)) because training function specified classprobs=T
logit.pred.sub = predict(logit.model.sub, newdata=filter(test, ideology_sub == 1), type="prob")

write_rds(logit.pred.sub, "results/sub_sponsorship_classification_logit_pred.rds")


#look at confusion matrix to get additional performance statistics
#use threshold 0.5 to classify Pr(State Support)
#compare predicted probability classification relative to true test$StateSupport
logit_results_sub = confusionMatrix(as.factor(ifelse(logit.pred.sub[,2] > 0.5, "support", "none")), 
                                    filter(test,ideology_sub == 1)$StateSupport, positive="support")
logit_results_sub
write_rds(logit_results_sub, "results/sub_sponsorship_classification_logit_confmatrix.rds")

logitmodel_confusionmatrix_sub <- tibble(model = c("SponsorshipClass")) %>%
  mutate(bal_accuracy = c(logit_results_sub$byClass[11])
         , sensitivity = c(logit_results_sub$byClass[1])
         , specificity = c(logit_results_sub$byClass[2])
         , kappa = c(logit_results_sub$overall[2])
         , auc = c(auc = c(auc(filter(test,ideology_sub == 1)$StateSupport, logit.pred.sub[,2])))
  )

write_csv(logitmodel_confusionmatrix_sub, "results/sub_sponsorship_classification_logit_confmatrix.csv")



#check AUC for accuracy
roc.curve(filter(test,ideology_sub == 1)$StateSupport, logit.pred.sub[,2])
auc(filter(test,ideology_sub == 1)$StateSupport, logit.pred.sub[,2])

#visualize ROC curve
pdf("sub_roc_sponsorship.pdf", 6, 6)
roc.curve(filter(test,ideology_sub == 1)$StateSupport, logit.pred.sub[,2])
dev.off()

# Variable Importance
# unlike statistical significance, this helps us assess predictive significance
vip(logit.model.sub, num_features = 24) + theme_bw()
#extract raw variable importance (vi) scores from the model
vi_scores_sub = vi(logit.model.sub)
vi_scores_sub

write_csv(vi_scores_sub, "results/sub_sponsorship_classification_logit_vi_scores.csv")

#create new column to vi scores assigning them to different categories
vi_scores_sub$category = ifelse(vi_scores_sub$Variable == "Separatist" |
                                  vi_scores_sub$Variable == "Leftist" |
                                  vi_scores_sub$Variable == "Ethnonational" |
                                  vi_scores_sub$Variable == "Islam" , "Aims & Ideology", NA)

vi_scores_sub$category[is.na(vi_scores_sub$category)==TRUE & (vi_scores_sub$Variable == "Neigh_Rival" | 
                                                                vi_scores_sub$Variable == "Avg_GDP_Neigh" |
                                                                vi_scores_sub$Variable == "Avg_Dem_Neigh" |
                                                                vi_scores_sub$Variable == "MID_Activity")] = "Neighborhood"
vi_scores_sub$category[is.na(vi_scores_sub$category)==TRUE & (vi_scores_sub$Variable == "Shared_Left_Ties" |
                                                                vi_scores_sub$Variable == "Shared_Ethnic_Ties" |
                                                                vi_scores_sub$Variable == "Shared_Sunni_Ties" |
                                                                vi_scores_sub$Variable == "Shared_Shia_Ties" )] = "Relational"
vi_scores_sub$category[is.na(vi_scores_sub$category)==TRUE] = "Organizational"



########################################
## Appendix Figure 4B
########################################

#vip is compatible with ggplot so can visualize model results
varimp_fullglm_sub  = vip(vi_scores_sub, geom="point", num_features = 29) + geom_point(aes(colour=factor(category)), size=3) + 
  scale_colour_discrete(name = "Group") + theme_bw() + 
  #ggtitle("Variable Importance for Correlates of State Sponsorship")+
  theme(plot.title = element_text(hjust = 0.5)) +theme(axis.text=element_text(size=16),
                                                       axis.title=element_text(size=16,face="bold")) + theme(legend.position = c(.65, .3))
varimp_fullglm_sub

png("figures/sub_sponsorship_glmvarimp.png",width=7,height=8, units = "in", res=500)
varimp_fullglm_sub
dev.off()

png("figures/sub_sponsorship_varimp.png",width=7,height=8, units = "in", res=500)
varimp_fullglm_sub
dev.off()


############################################
## Part B: Regression Analysis
#### Appendix Table 5
##############################################
#pooled model 
m0_sub = glm(StateSupport ~ Separatist + 
               Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
               EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
               PoliticalParty + Splinter + Merger  + 
               Transnational +   
               RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
               NSA_Support + PoliticalWing +
               
               Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
             data=filter(train, ideology_sub == 1),
             family="binomial")

nrow(model.matrix(m0_sub))
write_rds(m0_sub, "results/sub_m0.rds")
write.csv(coeftest(m0_sub, vcov(m0_sub)), "results/sub_m0.csv")


vcov_firm <- cluster.vcov(m0_sub, filter(train, ideology_sub == 1)$ccode1)
m0_cl_sub = coeftest(m0_sub, vcov_firm)
m0_cl_sub

write_rds(m0_cl_sub, "results/sub_m0_cl.rds")
write.csv(m0_cl_sub, "results/sub_m0_cl.csv")

m0_modelstats_sub <- tibble(model = "m0") %>%
  mutate(aic = AIC(m0_sub)
         , bic = BIC(m0_sub)
         , llik = logLik(m0_sub)[1]
         , deviance = m0_sub$deviance
         , num_obs = nrow(model.matrix(m0_sub))
  )

write_csv(m0_modelstats_sub, "results/sub_m0_modelstats.csv")


#country-adj model
m1_sub = glm(StateSupport ~ Separatist + 
               Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
               EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
               PoliticalParty + Splinter + Merger  + 
               Transnational +   
               RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
               NSA_Support + PoliticalWing +
               
               Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity + factor(ccode1),
             data = filter(train, ideology_sub == 1),
             family="binomial")

nrow(model.matrix(m1_sub))
write_rds(m1_sub, "results/sub_m1.rds")
write.csv(coeftest(m1_sub, vcov(m1_sub)), "results/sub_m1.csv")

vcov_firm <- cluster.vcov(m1_sub, filter(train, ideology_sub == 1)$ccode1)
m1_cl_sub = coeftest(m1_sub, vcov_firm)
m1_cl_sub

write_rds(m1_cl_sub, "results/sub_m1_cl.rds")
write.csv(m1_cl_sub, "results/sub_m1_cl.csv")

m1_modelstats_sub <- tibble(model = "m1") %>%
  mutate(aic = AIC(m1_sub)
         , bic = BIC(m1_sub)
         , llik = logLik(m1_sub)[1]
         , deviance = m1_sub$deviance
         , num_obs = nrow(model.matrix(m1_sub))
  )

write_csv(m1_modelstats_sub, "results/sub_m1_modelstats.csv")


#################################################
## Appendix Figure 4A
#################################################

#use sjplot to look at coefplot
spon_coef_sub = plot_model(m0_sub,
                           sort.coef = "sort.all", ci_method="wald",
                           transform = "exp", vcov.fun = "CL",
                           p.threshold = c(0.05, 0.01, 0.001),
                           vcov.type = "HC1", #use robust cluster standard errors
                           vcov.args = list(cluster = filter(train, ideology_sub == 1)$ccode1),   
                           show.values=TRUE, show.p=TRUE, vline.color = "gray90", dot.size=4, value.offset = .4,
                           value.size=10)  + theme_bw() + ggtitle(" ") +
  theme(plot.title = element_text(hjust = 0.5)) + theme(legend.text=element_text(size=28), legend.title=element_text(size=28))  + theme(axis.text=element_text(size=36),    axis.title=element_text(size=36)) + theme(plot.title = element_text(size=40))+ 
  theme(plot.title = element_text(hjust = 0.5)) 

spon_coef_sub + scale_y_continuous(limits = c(0, 100))

png("figures/sub_sponsorship_coefplot_1.png",width=15,height=21, units = "in", res=500)
spon_coef_sub + scale_y_continuous(limits = c(-1, 60))
dev.off()

png("figures/sub_sponsorship_coefplot.png",width=15,height=21, units = "in", res=500)
spon_coef_sub
dev.off()

##############
## Task 2: Correlates of Civil War
#### Appendix Table 6
##############
# Intuition: Same as above, use basic classification mechanism but change the outcome of interest to civil war

#pooled regression
m_war_sub = glm(LowIntensity_CivilWar ~ StateSupport + Separatist + 
                  Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
                  EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
                  PoliticalParty + Splinter + Merger  + 
                  Transnational +   
                  RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
                  NSA_Support + PoliticalWing +
                  
                  Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
                data = filter(train, ideology_sub == 1),family="binomial")
summary(m_war_sub)
nrow(model.matrix(m_war_sub))
write_rds(m_war_sub, "results/sub_m_war.rds")
write.csv(coeftest(m_war_sub, vcov(m_war_sub)), "results/sub_m_war.csv")

vcov_firm <- cluster.vcov(m_war_sub, filter(train, ideology_sub == 1)$ccode1)
m_war_cl_sub = coeftest(m_war_sub, vcov_firm)
m_war_cl_sub

write_rds(m_war_cl_sub, "results/sub_m_war_cl.rds")
write.csv(m_war_cl_sub, "results/sub_m_war_cl.csv")

require(sjPlot)

a_sub = plot_model(m_war_sub,
                   sort.coef = "sort.all", ci_method="wald",
                   transform = "exp", vcov.fun = "CL",
                   p.threshold = c(0.05, 0.01, 0.001),
                   vcov.type = "HC1",
                   vcov.args = list(cluster = filter(train, ideology_sub == 1)$ccode1),   
                   show.values=TRUE, show.p=TRUE, vline.color = "gray90", dot.size=4, value.offset = .4,
                   value.size=10)  + theme_bw() + ggtitle(" ") +
  theme(plot.title = element_text(hjust = 0.5)) + theme(legend.text=element_text(size=28), legend.title=element_text(size=28))  + theme(axis.text=element_text(size=36),    axis.title=element_text(size=36)) + theme(plot.title = element_text(size=40))+ 
  theme(plot.title = element_text(hjust = 0.5))


png("sub_civilwar_coefplot.png",width=15,height=21, units = "in", res=500)
a_sub
dev.off()


##############
# Task 3: Analysis of Variance
# Intuition: Compare model performance of different models to see if including info about state sponsorship improves model fit
##############

#sponsor model: estimates Pr(Civil War) using ONLY state sponsorship
m_war_ss_sub = glm(LowIntensity_CivilWar ~ StateSupport,
                   data = filter(train, ideology_sub == 1),family="binomial")
write_rds(m_war_ss_sub, "results/sub_m_war_ss.rds")
write.csv(coeftest(m_war_ss_sub, vcov(m_war_ss_sub)), "results/sub_m_war_ss.csv")

m_war_ss_modelstats_sub <- tibble(model = "m_war_ss") %>%
  mutate(aic = AIC(m_war_ss_sub)
         , bic = BIC(m_war_ss_sub)
         , llik = logLik(m_war_ss_sub)[1]
         , deviance = m_war_ss_sub$deviance
         , num_obs = nrow(model.matrix(m_war_ss_sub))
  )

write_csv(m_war_ss_modelstats_sub, "results/sub_m_war_ss_modelstats.csv")

#Look at predictive performance
logit.pred.ss.sub = predict(m_war_ss_sub, newdata=filter(test, ideology_sub == 1), type="response")
write_rds(logit.pred.ss.sub, "results/sub_m_war_ss_logitpred.rds")

logit_results_ss_sub = confusionMatrix(as.factor(ifelse(logit.pred.ss.sub > 0.5, 1,0)), 
                                       as.factor(filter(test, ideology_sub == 1)$LowIntensity_CivilWar), positive="1")
logit_results_ss_sub
write_rds(logit_results_ss_sub, "results/sub_m_war_ss_logitresults.rds")


#alt measure of predictive performance
roc.ss.sub = roc(filter(test, ideology_sub == 1)$LowIntensity_CivilWar, logit.pred.ss.sub)
auc.ss.sub = auc(filter(test, ideology_sub == 1)$LowIntensity_CivilWar, logit.pred.ss.sub)



#base model: estimates Pr(Civil War) includes main correlates but NOT state sponsorship
m_war_base_sub = glm(LowIntensity_CivilWar ~ Separatist + 
                       Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
                       EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
                       PoliticalParty + Splinter + Merger  + 
                       Transnational +   
                       RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
                       NSA_Support + PoliticalWing +
                       
                       Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
                     data = filter(train, ideology_sub == 1),family="binomial")

write_rds(m_war_base_sub, "results/sub_m_war_base.rds")
write.csv(coeftest(m_war_base_sub, vcov(m_war_base_sub)), "results/sub_m_war_base.csv")

m_war_base_modelstats_sub <- tibble(model = "m_war_base") %>%
  mutate(aic = AIC(m_war_base_sub)
         , bic = BIC(m_war_base_sub)
         , llik = logLik(m_war_base_sub)[1]
         , deviance = m_war_base_sub$deviance
         , num_obs = nrow(model.matrix(m_war_base_sub))
  )

write_csv(m_war_base_modelstats_sub, "results/sub_m_war_base_modelstats.csv")


#Look at predictive performance
logit.pred.base.sub = predict(m_war_base_sub, newdata=filter(test, ideology_sub == 1), type="response")
write_rds(logit.pred.base.sub, "results/sub_m_war_base_logitpred.rds")

logit_results_base_sub = confusionMatrix(as.factor(ifelse(logit.pred.base.sub > 0.5, 1,0)), 
                                         as.factor(filter(test, ideology_sub == 1)$LowIntensity_CivilWar), positive="1")
logit_results_base_sub
write_rds(logit_results_base_sub, "results/sub_m_war_base_logitresults.rds")

#alt measure of predictive performance

roc.base.sub = roc(filter(test, ideology_sub == 1)$LowIntensity_CivilWar, logit.pred.base.sub)
auc.base.sub = auc(filter(test, ideology_sub == 1)$LowIntensity_CivilWar, logit.pred.base.sub)


#full model: estimates Pr(Civil War) includes main correlates AND state sponsorship
m_war_full_sub = glm(LowIntensity_CivilWar ~ StateSupport + Separatist + 
                       Shared_Left_Ties + Shared_Ethnic_Ties + Shared_Sunni_Ties + Shared_Shia_Ties + 
                       EPRIrrelevant + EPRPowerless + EPRDiscriminated  +
                       PoliticalParty + Splinter + Merger  + 
                       Transnational +   
                       RefugeeBase  + StudentBase + VeteranBase + ReligiousBase + 
                       NSA_Support + PoliticalWing +
                       
                       Neigh_Rival + Avg_GDP_Neigh + Avg_Dem_Neigh + MID_Activity,
                     data = filter(train, ideology_sub == 1),family="binomial")

write_rds(m_war_full_sub, "results/sub_m_war_full.rds")
write.csv(coeftest(m_war_full_sub, vcov(m_war_full_sub)), "results/sub_m_war_full.csv")

m_war_full_modelstats_sub <- tibble(model = "m_war_full") %>%
  mutate(aic = AIC(m_war_full_sub)
         , bic = BIC(m_war_full_sub)
         , llik = logLik(m_war_full_sub)[1]
         , deviance = m_war_full_sub$deviance
         , num_obs = nrow(model.matrix(m_war_full_sub))
  )

write_csv(m_war_full_modelstats_sub, "results/sub_m_war_full_modelstats.csv")

#Look at predictive performance
logit.pred.full.sub = predict(m_war_full_sub, newdata = filter(test, ideology_sub == 1), type="response")
write_rds(logit.pred.full.sub, "results/sub_m_war_full_logitpred.rds")


logit_results_full_sub = confusionMatrix(as.factor(ifelse(logit.pred.full.sub > 0.5, 1,0)), 
                                         as.factor(filter(test, ideology_sub == 1)$LowIntensity_CivilWar), positive="1")
logit_results_full_sub
write_rds(logit_results_full_sub, "results/sub_m_war_full_logitresults.rds")

#alt measure of predictive performance
roc.full.sub = roc(filter(test, ideology_sub == 1)$LowIntensity_CivilWar, logit.pred.full.sub)
auc.full.sub = auc(filter(test, ideology_sub == 1)$LowIntensity_CivilWar, logit.pred.full.sub)


########################################################
## Appendix Table 7
## Compare fit of different modeling specifications
########################################################
anova(m_war_ss_sub, m_war_base_sub)
anova(m_war_ss_sub, m_war_full_sub)

anova(m_war_base_sub, m_war_full_sub)

m_war_anova_sub <- anova(m_war_ss_sub, m_war_base_sub, m_war_full_sub)
m_war_anova_sub
write.csv(m_war_anova_sub, "results/sub_m_war_anova.csv")

#compare confusion matrix of all 3 models
logit_results_ss_sub

logit_results_base_sub

logit_results_full_sub

#kappa
logit_results_ss_sub$overall[2]
logit_results_base_sub$overall[2]
logit_results_full_sub$overall[2]

#accuracy/sensitivity/specificity
logit_results_ss_sub$byClass
logit_results_base_sub$byClass
logit_results_full_sub$byClass


#compare AUC of all 3 models
auc.ss.sub
auc.base.sub
auc.full.sub

#########################################################
## Appendix Table 8
#########################################################
m_war_confusionmatrix_sub <- tibble(model = c("Base", "Sponsorship", "Full")) %>%
  mutate(bal_accuracy = c(logit_results_base_sub$byClass[11], logit_results_ss_sub$byClass[11], logit_results_full_sub$byClass[11])
         , sensitivity = c(logit_results_base_sub$byClass[1], logit_results_ss_sub$byClass[1], logit_results_full_sub$byClass[1])
         , specificity = c(logit_results_base_sub$byClass[2], logit_results_ss_sub$byClass[2], logit_results_full_sub$byClass[2])
         , kappa = c(logit_results_base_sub$overall[2], logit_results_ss_sub$overall[2], logit_results_full_sub$overall[2])
         , auc = c(auc.base.sub, auc.ss.sub, auc.full.sub)
  )

write_csv(m_war_confusionmatrix_sub, "results/sub_m_war_confusionmatrix.csv")


#################################################################
## Appendix Figure 5
## visualize AUC/ROC performance of all models
##################################################################
ggroc(roc.base.sub)
g <- ggroc(list(roc.base.sub, roc.ss.sub, roc.full.sub),aes=c("linetype"), legacy.axes=T)  + 
  theme_light() +  labs(y = "Sensitivity", x = "1-Specificity") + 
  scale_linetype_manual(values=c("solid", "twodash", "dotted"), 
                        labels=c("Base", "Sponsorship", "Base + Sponsorship"), name="Model:") +  
  geom_text(data=NULL, check_overlap=TRUE, size=4, hjust = 0, aes(x=0.5, y=0.08, 
                                                                  label= paste0("AUC:", "\n Base: ", as.character(round(auc.base.sub, 3)), "\n Sponsorship: ", as.character(round(auc.ss.sub, 3)), "\n Base + Sponsorship: ", as.character(round(auc.full.sub, 3)))
  )) +    
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="darkgrey", linetype="dashed") 


pdf("figures/sub_roc_comparison.pdf", 8,7)
g
dev.off()

